# CREATE TABLE `proteins`.`seqres` (
#   `pdbname` char(6)  NOT NULL,
#   `chain` char(1)  NOT NULL,
#   `seqres` text ,
#   `precision` float ,
#   PRIMARY KEY (`pdbname`, `chain`)
# )
#-------------------------------------------------------------------
#sets up the connection to mysql database. $connection global var holds the handle
def connect_to_proteins_db
	
	if $in_intel
		$SCRIPTHOME = '/nfs/iil/proj/mpgarch/arch_xpool/snepomny/BCS/framework/stage_7/patriot_tools/script/'
		$: << "#{$SCRIPTHOME}/../rubylib"
		$: << "#{$SCRIPTHOME}/../app/models"
		$: << "/nfs/site/proj/dpg/arch/perfhome/rubylib"
		$: << "/nfs/site/proj/dpg/arch/perfhome/rubylib-ext/1.8"
		require 'rbconfig'
		$: << "/nfs/site/proj/dpg/arch/perfhome/rubylib-ext/1.8/#{Config::CONFIG['arch']}"
		require 'rubygems'
		require 'mysql'
		$connection  = Mysql::new("iinf012", "ykouniav", "password123", "patriot_test",3310)
	elsif $on_rachel_lx or $on_rachel2
		begin
		require "/usr/local/lib/site_ruby/1.8/mysql.rb"
		rescue
		    raise "Could not load mysql"
 		end
		begin
		$connection  = Mysql::new("localhost", "rsyncuser", "pdb4all", "rsync")
		rescue => err
		    raise "Could not load mysql: #{err}"
		end


	else
		require "mysql"
		$connection  = Mysql::new("localhost", "snepomny", "kolobok", "proteins")
 
	end
end
#-------------------------------------------------------------------
#------------------------------------------------------------------
def create_table_closeones
begin
$connection.query " CREATE TABLE `closeones` (
  `l_name` char(6) NOT NULL,
  `r_name` char(6) NOT NULL,
  `part` int(11) NOT NULL,
  `evalue` char(10) default NULL,
  `length` int(11) NOT NULL,
  `ident` int(10) unsigned NOT NULL,
  `positives` int(10) unsigned NOT NULL,
  `gaps` int(10) unsigned NOT NULL,
  `left_q` int(3) unsigned NOT NULL,
  `right_q` int(3) unsigned NOT NULL,
  `left_s` int(3) unsigned NOT NULL,
  `right_s` int(3) unsigned NOT NULL,
  `seq_q` text NOT NULL,
  `seq_s` text NOT NULL,
  `rmsd` float NOT NULL,
  `gdmt1` float NOT NULL,
  `gdmt2` float NOT NULL,
  `gdmt3` float NOT NULL,
  PRIMARY KEY  USING BTREE (`l_name`,`r_name`,`part`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1
"
rescue => err
	log_me err
end
end
#------------------------------------------------------------------
def create_table_leaders_pairs
begin
$connection.query " CREATE TABLE `leaders_pairs` (
  `l_name` char(6) NOT NULL,
  `r_name` char(6) NOT NULL,
  `part` int(11) NOT NULL,
  `evalue` char(10) default NULL,
  `length` int(11) NOT NULL,
  `ident` int(10) unsigned NOT NULL,
  `positives` int(10) unsigned NOT NULL,
  `gaps` int(10) unsigned NOT NULL,
  `left_q` int(3) unsigned NOT NULL,
  `right_q` int(3) unsigned NOT NULL,
  `left_s` int(3) unsigned NOT NULL,
  `right_s` int(3) unsigned NOT NULL,
  `seq_q` text NOT NULL,
  `seq_s` text NOT NULL,
  `rmsd` float NOT NULL,
  `gdmt1` float NOT NULL,
  `gdmt2` float NOT NULL,
  `gdmt3` float NOT NULL,
  PRIMARY KEY  USING BTREE (`l_name`,`r_name`,`part`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1
"
rescue => err
	log_me err
end
end
#-------------------------------------------------------------------
#-------------------------------------------------------------------
def get_fasta_by_name(name)
	res =  $connection.query "select sequence from fasta_chains where name = '#{name}'"
	res.each do |r| 
		return r[0]
	end
end
#-------------------------------------------------------------------
def write_fasta_to_file(name)
	seq = get_fasta_by_name(name)
	f = File.new(name+'.seq',"w")
	f.puts ">"+name
	f.puts seq
	f.flush
	f.close
end
#keys_count - first columns are a composite key to table_name - how many of them?
def insert_or_update(table_name,columns_arr,values_arr,keys_count,update_on_fail=true)
	raise "columns_arr.size != values_arr.size" if columns_arr.size != values_arr.size
	#insert into table_name (column1, column2, ...) values (value1, value2 ...);
	#UPDATE table_name SET column1=value, column2=value2,... WHERE some_column=some_value;
        q_insert = "insert into #{table_name} (#{columns_arr.join(", ")}) values (#{values_arr.join(",")})" 
	begin
		$connection.query q_insert
	rescue => err
		#err_log "insert_failed with #{err}" 
		where_clause = []
		set_clause = []
		(0..keys_count).each do |i|
			where_clause << "#{columns_arr[i]}=#{values_arr[i]}"
		end
		(keys_count..columns_arr.size-1).each do |i|
			set_clause << "#{columns_arr[i]}=#{values_arr[i]}"
		end
		begin
			$connection.query "update #{table_name} set #{set_clause.join(", ")} where #{where_clause.join(" and ")}" if update_on_fail
		rescue => err
			err_log "update #{table_name} set #{set_clause.join(", ")} where #{where_clause.join(" and ")}"
		end
	end
end
#-------------------------------------------------------------------
def insert_or_update_seqres(name, chain, seqres, precision)
	insert_or_update("seqres", ['pdbname', 'chain', 'seqres','prec'], ["'#{name}'","'#{chain}'","'#{seqres}'","#{precision}"],1)
end

#-------------------------------------------------------------------
# def convert_from_fasta_name_to_filename name
# 	#101m_A => pdb101m.ent (and .gz if gzipped)
# 	return [ "pdb" + name.split("_")[0] + ".ent", name.split("_")[1].upcase]
# end
#-------------------------------------------------------------------
def get_some_chain_names(limit=123456)
	query_string = "select l_name from chain_pairs  group by l_name limit #{limit.to_s};"
	res = $connection.query query_string
	results_arr = []
	res.each do |line|
		results_arr << ( convert_from_fasta_name_to_filename line[0])
	end
	return results_arr
end
#-------------------------------------------------------------------
def chains_iterate(where_clause='', limit=99123456, &block)
	query_string = "select pdbname,chain,seqres from seqres #{where_clause} limit #{limit.to_s};"
	res = $connection.query query_string
	res.each do |line|
		# name,chain,sequence
		yield line[0],line[1],line[2]
	end
end
#-------------------------------------------------------------------
def get_blast_pairs_count
	query_string = "select count(*) from blast_pairs"
	res = $connection.query query_string
	res.each do |line|
		return line[0].to_i
	end
end
#-------------------------------------------------------------------
def update_len_in_pairs(l,r,len)
	 #UPDATE table_name SET column1=value, column2=value2,... WHERE some_column=some_value;
	$connection.query "update blast_pairs set length=#{len} where l_name='#{l}' and r_name='#{r}'" 

end
#-------------------------------------------------------------------
def full_blast_iterator
	limit=0
	blast_pairs_count=get_blast_pairs_count   
while limit<blast_pairs_count do
	query_string = "select * from blast_pairs limit #{limit},100000"
	res = $connection.query query_string
	res.each do |line|
#l_name,r_name,part,evalue,length,ident,positives,gaps,left_q,right_q,left_s,right_s,seq_q,seq_s,rmsd,gdmt1,gdmt2,gdmt3
		yield line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7],line[8],line[9],line[10],line[11],line[12],line[13],line[14],line[15],line[16],line[17],line[18]
	end
	limit +=100000
end #while
end

def blast_hits_iterate(limit=99123456, where_clause='', &block)
	query_string = "select l_name,r_name,left_q,left_s,seq_q,seq_s from blast_pairs #{where_clause} limit #{limit.to_s};"
	res = $connection.query query_string
	res.each do |line|
		# l_name,r_name,left_q,left_s,seq_q,seq_s
		yield line[0],line[1],line[2],line[3],line[4],line[5]
	end
end
#-------------------------------------------------------------------
#will take hit_hash and insert the data in DB
def insert_pair(query_name,hit_hashs_arr)
 	counter = -1
 	
# 	hit_hash = hit_hashs_arr[0]
#	pair = [query_name.strip,hit_hash[:r_name].strip].sort #sort lex. the names of chains
# 	puts hit_hashs_arr.size.to_s if pair[0]==pair[1] #
#  	return	
# 	if hit_hashs_arr.size > 1 
# 		
# 		
# 		
# # 		h0 = hit_hashs_arr[0]
# # 		h =  hit_hashs_arr[1]
# # 		
# # 		if ( (h[:left_q].to_i < h0[:right_q].to_i or h[:left_q].to_i < h0[:right_s].to_i) or (h[:left_s].to_i < h0[:right_q].to_i or h[:left_s].to_i < h0[:right_s].to_i)  )
# # 			puts "#{query_name}" # - #{h[:r_name]}"
# # 		end
# 		return
# 	end
# 	return
  	hit_hashs_arr.each do |hit_hash|
#  	hit_hash = hit_hashs_arr[0]
  		pair = [query_name,hit_hash[:r_name]].sort #sort lex. the names of chains
		return if pair[0]==pair[1] #we don't want self-similarity

		left_q = hit_hash[:left_q]
		right_q = hit_hash[:right_q]
		left_s = hit_hash[:left_s]
		right_s = hit_hash[:right_s]
		seq_q = hit_hash[:seq_q]
		seq_s = hit_hash[:seq_s]
		#but, alas, the sorting creates the following problem - we maybe have to swap q and s data:
		if query_name==pair[1] #swap
			left_q = hit_hash[:left_s]
			right_q = hit_hash[:right_s]
			left_s = hit_hash[:left_q]
			right_s = hit_hash[:right_q]
			seq_q = hit_hash[:seq_s]
			seq_s = hit_hash[:seq_q]
		end
	
  		counter = counter+1
		insert_or_update("blast_pairs", ['l_name', 'r_name', 'part','evalue','length','ident','positives','gaps','left_q','right_q','left_s','right_s','seq_q','seq_s','rmsd','gdmt1','gdmt2','gdmt3','tmscore'], ["'#{pair[0]}'","'#{pair[1]}'","#{counter.to_s}","'#{hit_hash[:evalue]}'","#{hit_hash[:length].to_s}","#{hit_hash[:ident].to_s}", "#{hit_hash[:positives].to_s}","#{hit_hash[:gaps].to_s}","#{left_q}","#{right_q}","#{left_s}","#{right_s}","'#{seq_q}'", "'#{seq_s}'","#{hit_hash[:rmsd]}","#{hit_hash[:gdmt1]}","#{hit_hash[:gdmt2]}","#{hit_hash[:gdmt3]}","#{hit_hash[:tmscore]}" ],1,false)
			
#  			q_string = "insert into blast_pairs (l_name, r_name, part, evalue, length, ident, positives, gaps, left_q, right_q, left_s, right_s, seq_q, seq_s) values ('#{pair[0]}','#{pair[1]}',#{counter.to_s},#{hit_hash[:evalue]},#{hit_hash[:length]},#{hit_hash[:ident]}, #{hit_hash[:positives]}, #{hit_hash[:gaps]},#{left_q},#{right_q},#{left_s},#{right_s}, '#{seq_q}', '#{seq_s}' )"

 	end
end
#-------------------------------------------------------------------
def dump_blast_hits_to_file(fname)
	f = File.new(fname,'w')
	counter = 0
	blast_hits_iterate() { |l_name,r_name,blast_offset_q,blast_offset_s,blast_q,blast_s|
		f.puts "#{l_name} #{r_name} #{blast_offset_q} #{blast_offset_s} #{blast_q} #{blast_s}"
		puts counter if 1 == 1001 % counter+=1
	}
	f.flush
	f.close
end
#-------------------------------------------------------------------
def dump_blast_as_flat_to_file(fname='')
#flat is:
#l_name r_name eval length ident pos gaps left_s right_s left_q right_q seq_s seq_q 
query_string = "select l_name,r_name,evalue,length,ident,positives,gaps,left_s,right_s,left_q,right_q,seq_q,seq_q from blast_pairs where length>999;"
	res = $connection.query query_string
	res.each do |line|
		# l_name,r_name,left_q,left_s,seq_q,seq_s
		puts line.join(',')
	end

end
#-------------------------------------------------------------------
def closeones_iterate(where_clause='', limit=99123456, &block)
	query_string = "select l_name,r_name from closeones #{where_clause} limit #{limit.to_s};"
	res = $connection.query query_string
	res.each do |line|
		# name,chain,sequence
		yield line[0],line[1]
	end
end
#-------------------------------------------------------------------
#
#  Functions for clustering
#
#-------------------------------------------------------------------
def get_closeones(chain, ident, rmsd)
	query = "select  l_name,r_name from blast_pairs where (l_name like '#{chain}%' or r_name like '#{chain}%') and ident > #{ident} and rmsd < #{rmsd} order by ident desc"
	res = $connection.query query
	arr = []
	arr << chain #itself is the closest
	res.each do |row|
		arr << ((row[0]==chain) ? row[1] : row[0])
		
	end
	
	return arr
end
#-------------------------------------------------------------------
def get_leaders()
	query = "select  distinct leader as l,',' from clusters"
	res = $connection.query query
	arr = {}
	res.each do |row|
		arr[row[0]]=1
	end
	
	return arr	
end
#-------------------------------------------------------------------
def get_closeones_strict(chain)
	query = "select  l_name,r_name from closeones where (l_name='#{chain}' or r_name='#{chain}') order by ident desc"
	res = $connection.query query
	arr = []
	arr << chain #itself is the closest
	res.each do |row|
		arr << ((row[0]==chain) ? row[1] : row[0])
		
	end
	
	return arr
end
#-------------------------------------------------------------------	
def select_leader_from_relatives(arr)
	return arr[0]
end
#-------------------------------------------------------------------
def update_clusters_pair(chain,leader)
	insert_or_update("clusters", ['chain', 'leader'], ["'#{chain}'","'#{leader}'"],0)
end
#-------------------------------------------------------------------
def build_clusters!(ident=99,rmsd=1)

	chain_leader = {}
	counter = 0
	chains_iterate { |pdb,chain|
		counter += 1
		log_me counter.to_s if counter%1024 == 0
 		next if chain_leader[pdb+chain]
 		arr = get_closeones_strict(pdb+chain)
 		leader = select_leader_from_relatives(arr)
 		arr.each do |a|
 			chain_leader[a] = leader
 		end
 	}
	
	$connection.query "delete from clusters"
	chain_leader.each_pair { |c,l|
		update_clusters_pair(c,l)
	}
	
	
end
#-------------------------------------------------------------------
#make dundant set
def reduce_blast_pairs_to_leaders
	hsh = get_leaders
        cnt = 0
	full_blast_iterator { |l_name,r_name,part,evalue,length,ident,positives,gaps,left_q,right_q,left_s,right_s,seq_q,seq_s,rmsd,gdmt1,gdmt2,gdmt3,tmscore|
		cnt += 1
		puts cnt.to_s if cnt%1000==0
		insert_or_update("leaders_pairs", ['l_name', 'r_name', 'part','evalue','length','ident','positives','gaps','left_q','right_q','seq_q','seq_s','rmsd','gdmt1','gdmt2','gdmt3','tmscore'], ["'#{l_name}'","'#{r_name}'",part,"'#{evalue}'",length,ident,positives,gaps,left_q,right_q,"'#{seq_q}'","'#{seq_s}'",rmsd,gdmt1,gdmt2,gdmt3,tmscore],1) if ( hsh[r_name] and hsh[l_name])
	}
end	


#-------------------------------------------------------------------
def rebuild_db_after_blast_update
	$connection.query "delete from closeones"
	$connection.query "insert into closeones (select * from blast_pairs where ident>99 and rmsd<=1)"
	build_clusters!
	$connection.query "delete from leaders_pairs"
	reduce_blast_pairs_to_leaders
end
#-------------------------------------------------------------------
def set_rfactor_for_protein(name,rfactor)
	$connection.query "update seqres set rfactor=#{rfactor} where pdbname='#{name.downcase}'"
end
